MetaT - in prep - SKH

Import data from zenodo - required files:

  • Normalized_data_08022018.RData
  • Normed_avg_annotated_08022018.RData
  • ReNorm_bytax_08022018.RData
In [4]:
library(reshape2); library(dplyr)
library(ggplot2); library(pheatmap)
library(vegan);library(cowplot)
library(tidyverse)
Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: permute

Loading required package: lattice

This is vegan 2.5-6


********************************************************

Note: As of version 1.0.0, cowplot does not change the

  default ggplot2 theme anymore. To recover the previous

  behavior, execute:
  theme_set(theme_cowplot())

********************************************************


Generate taxa plots

In [3]:
load("Normed_avg_annotated_08022018.RData", verbose=TRUE)
Loading objects:
  avg_CPM
  df_wtax
  df_wKO_wdups
In [ ]:
# Format to plot at Supergroup level
# unique(df_wtax$Supergroup)
df_wtax$sample<-paste(df_wtax$Location, df_wtax$Depth, sep=" ")
#
# Remove unassigned taxonomy.
df_wtax_noNA<-subset(df_wtax, !(Taxonomy %in% "Not assigned"))

# Supergroup level
super<-aggregate(df_wtax_noNA$mean_CPM, by=list(Location=df_wtax_noNA$Location, Depth=df_wtax_noNA$Depth, sample=df_wtax_noNA$sample, Supergroup=df_wtax_noNA$Supergroup), sum)

## Re-compile taxonomy breakdown for visualization purposes
compile_tax<-function(df){
    df$Nextlevel<-df$Phylum
    df$Nextlevel[df$Phylum==""]="Other"
    df$Nextlevel[df$Class=="Foraminifera"]="Foraminifera"
    df$Nextlevel[df$Class=="Acantharia"]="Acantharia"
    df$Nextlevel[df$Class=="Polycystinea"]="Polycystinea" 
    df$Nextlevel[df$Class=="Syndinians"]="Syndiniales" 
    df$Nextlevel[df$Class=="Bacillariophyceae"]="Bacillariophyceae" 
    df$Nextlevel[df$Class=="Pelagophyceae"]="Pelagophyceae" 
  df$tax_compiled<-df$Supergroup
  df$tax_compiled[df$Supergroup == "Alveolate"]<-"Other Alveolate"
  df$tax_compiled[df$Nextlevel=="Ciliate"]="Ciliate"
  df$tax_compiled[df$Nextlevel=="Dinoflagellate"]="Dinoflagellate"
  df$tax_compiled[df$Nextlevel=="Syndiniales"]="Syndiniales"
  #
  df$tax_compiled[df$Supergroup == "Archaeplastida"]<-"Other Archaeplastida"
  df$tax_compiled[df$Nextlevel=="Chlorophyta"]="Chlorophyta"
  #
  #df$tax_compiled[df$Supergroup=="Rhizaria"]<-"Other Rhizaria"
  #df$tax_compiled[df$Nextlevel=="Cercozoa"]="Cercozoa"
  #df$tax_compiled[df$Nextlevel=="Retaria"]="Retaria"
  #df$tax_compiled[df$Nextlevel=="Acantharia"]="Acantharia"
  #df$tax_compiled[df$Nextlevel=="Foraminifera"]="Foraminifera"
  #df$tax_compiled[df$Nextlevel=="Polycystinea"]="Polycystinea"
  #
  df$tax_compiled[df$Supergroup=="Stramenopile"]<-"Other Stramenopile"
  df$tax_compiled[df$Nextlevel=="MAST"]="MAST"
  df$tax_compiled[df$Nextlevel=="Bacillariophyceae"]="Diatom"
  df$tax_compiled[df$Nextlevel=="Pelagophyceae"]="Pelagophytes"
  #
  other<-c("Amoebozoa", "Cryptista", "Discoba")
  df$tax_compiled[df$Supergroup %in% other]="Other"
  return(df)
}
df_tax<-compile_tax(df_wtax_noNA)
unique(df_tax$tax_compiled)
In [113]:
# Generate taxonomy reference
tax_ref <- df_tax %>% 
    select(Taxonomy, Nextlevel, tax_compiled) %>% 
    distinct() %>% 
    data.frame
# write_delim(tax_ref, path = "taxonomic-assign-reference.txt", delim = "\t")
# ^ Use for downstream curation of taxonomic assignments when necessary
A data.frame: 6 × 15
LocationDepthTaxonomyKOmean_CPMSupergroupPhylumClassOrderFamilyGenusSpeciessampleNextleveltax_compiled
<chr><chr><fct><fct><dbl><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr>
1ALOHA1000mAlveolate;K000020.00000000AlveolateALOHA 1000mOtherOther Alveolate
2ALOHA1000mAlveolate;K000030.00000000AlveolateALOHA 1000mOtherOther Alveolate
3ALOHA1000mAlveolate;K000060.00000000AlveolateALOHA 1000mOtherOther Alveolate
4ALOHA1000mAlveolate;K000080.00000000AlveolateALOHA 1000mOtherOther Alveolate
5ALOHA1000mAlveolate;K000100.00000000AlveolateALOHA 1000mOtherOther Alveolate
6ALOHA1000mAlveolate;K000110.01522523AlveolateALOHA 1000mOtherOther Alveolate
In [11]:
# Remove zero counts and sum by curated taxa levels
df_tax_no0 <- subset(df_tax, mean_CPM > 0)
df_tax_sum<-df_tax_no0 %>%
  group_by(Location, Depth, sample, tax_compiled) %>%
  summarise(SUM = sum(mean_CPM), COUNT = n()) %>%
  as.data.frame
`summarise()` regrouping output by 'Location', 'Depth', 'sample' (override with `.groups` argument)

In [42]:
# Factor:
tax_order_compiled<-c("Dinoflagellate","Ciliate","Syndiniales","Other Alveolate","MAST","Diatom","Pelagophytes","Other Stramenopile","Chlorophyta","Other Archaeplastida","Haptophytes","Rhizaria","Opisthokont", "Other")
tax_order_label<-c("Dinoflagellates","Ciliates","Syndiniales","Other Alveolates","MAST","Diatoms","Pelagophytes","Other Stramenopiles","Chlorophytes","Other Archaeplastid","Haptophytes","Rhizaria","Opisthokonts", "Other")
tax_order_color<-c("#612741","#b74a70","#b7757c","#eecfbf","#92462f","#bb603c","#dfa837","#ccc050","#33431e","#93b778","#61ac86","#657abb","#1c1949","#8a8d84")
df_tax_sum$TAX_ORDER<-factor(df_tax_sum$tax_compiled, levels = rev(tax_order_compiled), labels = rev(tax_order_label))
names(tax_order_color)<-(tax_order_label)
head(df_tax_sum[1:2,])
#
sample_list<-c("Catalina surface", "PortofLA surface", "SPOT surface", "ALOHA surface","ALOHA DCM", "ALOHA 150m", "SPOT 150m","SPOT 890m","ALOHA 1000m")
df_tax_sum$SAMPLE_ORDER<-factor(df_tax_sum$sample, levels=rev(sample_list))
#
plot_tax_compiled<-ggplot(df_tax_sum,aes(y = SUM,x = SAMPLE_ORDER,fill=(TAX_ORDER)))+
  geom_bar(stat="identity", position="fill", color="white")+labs(title="", x="",y="Relative abundance CPM")+
  scale_x_discrete(limits=c(), expand = c(0, 0))+
  scale_fill_manual(values=rev(tax_order_color))+
  coord_flip()+
  scale_y_continuous(expand=c(0,0))+
  theme(legend.title=element_blank(),legend.position="bottom",legend.text.align=0, axis.text = element_text(color="black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),panel.border = element_blank(), axis.line = element_line())+
  guides(fill=guide_legend(reverse=TRUE))
A data.frame: 2 × 8
LocationDepthsampletax_compiledSUMCOUNTTAX_ORDERSAMPLE_ORDER
<chr><chr><chr><chr><dbl><int><fct><fct>
1ALOHA1000mALOHA 1000mChlorophyta 6829.1582214ChlorophytesALOHA 1000m
2ALOHA1000mALOHA 1000mCiliate 17989.9173219Ciliates ALOHA 1000m
In [62]:
options(repr.plot.width = 6, repr.plot.height = 5)
# svg("taxa-barplot.svg", w = 6, h = 5)
plot_tax_compiled #w: 645  h: 490
# dev.off()
In [16]:
# Re-aggregate
super_phy<-aggregate(df_tax$mean_CPM, by=list(Location=df_tax$Location, 
                                                    Depth=df_tax$Depth, 
                                                    sample=df_tax$sample, 
                                                    Supergroup=df_tax$Supergroup, 
                                                    Nextlevel=df_tax$Nextlevel), sum)
# head(super_phy)
#
# Factor/order Next level
unique(super_phy$Nextlevel)
next_order<-c("Dinoflagellate","Ciliate","Syndiniales","Apicomplexa","Ochrophyta","Bacillariophyceae","Pelagophyceae","MAST","Bicosoecida","Labyrinthulomycota","Chlorophyta","Rhodophyta","Glaucophyta","Acantharia","Retaria","Cercozoa","Polycystinea","Foraminifera","Choanoflagellatea","Cryptophyta","Discosea","Euglenida","Fungi","Haptophyta","Heterolobosea","Kinetoplastea","Metazoa","Tubulinea","Variosea","Other")
next_color<-c("#67001f","#e7298a","#c994c7","#e7e1ef","#bd0026","#fc4e2a","#feb24c","#d94801","#ffeda0","#fdae6b","#238443","#78c679","#d9f0a3","#08306b","#2171b5","#6baed6","#c6dbef","#9e9ac8","#54278f","#54278f","#54278f","#54278f","#54278f","#54278f","#54278f","#54278f","#54278f","#54278f","#54278f","#525252")
super_phy$Nextlevel_order<-factor(super_phy$Nextlevel, levels = next_order)
names(next_color)<-next_order
#
library(ggplot2)
library(RColorBrewer)
#
head(super_phy)
plot_pies<-function(data, supergroup, loc, col){
  ggplot(subset(data, Supergroup %in% supergroup & sample %in% loc),aes(y=x,x=sample,fill=Nextlevel_order))+
    geom_bar(stat="identity", position="fill", color="white")+labs(title="", x=loc,y="")+
    scale_x_discrete(limits=c(), expand = c(0, 0))+
    scale_fill_manual(values = next_color)+
    coord_flip()+
    scale_y_continuous(position = "top", expand=c(0,0))+
    theme(legend.title=element_blank(),legend.position="none",legend.text.align=0, axis.text = element_blank(),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),panel.border = element_blank(), axis.line = element_blank(), axis.ticks = element_blank())+
    coord_polar(theta='y')+
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))+
    NULL
}
  1. 'Acantharia'
  2. 'Apicomplexa'
  3. 'Bacillariophyceae'
  4. 'Bicosoecida'
  5. 'Cercozoa'
  6. 'Chlorophyta'
  7. 'Choanoflagellatea'
  8. 'Ciliate'
  9. 'Cryptophyta'
  10. 'Dinoflagellate'
  11. 'Discosea'
  12. 'Euglenida'
  13. 'Foraminifera'
  14. 'Fungi'
  15. 'Glaucophyta'
  16. 'Haptophyta'
  17. 'Heterolobosea'
  18. 'Kinetoplastea'
  19. 'Labyrinthulomycota'
  20. 'MAST'
  21. 'Metazoa'
  22. 'Ochrophyta'
  23. 'Other'
  24. 'Pelagophyceae'
  25. 'Polycystinea'
  26. 'Retaria'
  27. 'Rhodophyta'
  28. 'Syndiniales'
  29. 'Tubulinea'
  30. 'Variosea'
A data.frame: 6 × 7
LocationDepthsampleSupergroupNextlevelxNextlevel_order
<chr><chr><chr><chr><chr><dbl><fct>
1ALOHA 1000m ALOHA 1000m RhizariaAcantharia2743.0301Acantharia
2ALOHA 150m ALOHA 150m RhizariaAcantharia1752.2302Acantharia
3ALOHA DCM ALOHA DCM RhizariaAcantharia1713.8470Acantharia
4ALOHA surfaceALOHA surface RhizariaAcantharia1312.2637Acantharia
5CatalinasurfaceCatalina surfaceRhizariaAcantharia 790.3904Acantharia
6PortofLAsurfacePortofLA surfaceRhizariaAcantharia 923.6476Acantharia
In [19]:
#
# Generate pie charts for individual taxonomic groups
#

# Alveolates pie plots
leg<-get_legend(plot_pies(super_phy, "Alveolate", "SPOT 150m", next_color)+theme(legend.title=element_blank(),legend.position="bottom", legend.direction = "vertical"))
# svg("alveolate.svg",h=15)
plot_grid(plot_pies(super_phy, "Alveolate", "Catalina surface", next_color),
          plot_pies(super_phy, "Alveolate", "PortofLA surface", next_color),
          plot_pies(super_phy, "Alveolate", "SPOT surface", next_color),
          plot_pies(super_phy, "Alveolate", "ALOHA DCM", next_color),
          plot_pies(super_phy, "Alveolate", "ALOHA 150m", next_color),
          plot_pies(super_phy, "Alveolate", "ALOHA surface", next_color),
          plot_pies(super_phy, "Alveolate", "SPOT 890m", next_color),
          plot_pies(super_phy, "Alveolate", "SPOT 150m", next_color),
          plot_pies(super_phy, "Alveolate", "ALOHA 1000m", next_color),
          leg,
          ncol=1, nrow=10, hjust=-2,vjust=5)

# leg<-get_legend(plot_pies(super_phy, "Stramenopile", "SPOT 150m", next_color)+theme(legend.title=element_blank(),legend.position="bottom", legend.direction = "vertical"))
# # svg("Stramenopile_pies.svg",h=15)
# plot_grid(plot_pies(super_phy, "Stramenopile", "Catalina surface", next_color),
#           plot_pies(super_phy, "Stramenopile", "PortofLA surface", next_color),
#           plot_pies(super_phy, "Stramenopile", "SPOT surface", next_color),
#           plot_pies(super_phy, "Stramenopile", "ALOHA DCM", next_color),
#           plot_pies(super_phy, "Stramenopile", "ALOHA 150m", next_color),
#           plot_pies(super_phy, "Stramenopile", "ALOHA surface", next_color),
#           plot_pies(super_phy, "Stramenopile", "SPOT 890m", next_color),
#           plot_pies(super_phy, "Stramenopile", "SPOT 150m", next_color),
#           plot_pies(super_phy, "Stramenopile", "ALOHA 1000m", next_color),
#           leg,
#           ncol=1, nrow=10, rel_widths = 0.1, rel_heights = 0.01,hjust=-2,vjust=5)
# # dev.off()
# #
# leg<-get_legend(plot_pies(super_phy, "Archaeplastida", "SPOT 150m", next_color)+theme(legend.title=element_blank(),legend.position="bottom", legend.direction = "vertical"))
# # svg("Archaeplastida_pies.svg",h=15)
# plot_grid(plot_pies(super_phy, "Archaeplastida", "Catalina surface", next_color),
#           plot_pies(super_phy, "Archaeplastida", "PortofLA surface", next_color),
#           plot_pies(super_phy, "Archaeplastida", "SPOT surface", next_color),
#           plot_pies(super_phy, "Archaeplastida", "ALOHA DCM", next_color),
#           plot_pies(super_phy, "Archaeplastida", "ALOHA 150m", next_color),
#           plot_pies(super_phy, "Archaeplastida", "ALOHA surface", next_color),
#           plot_pies(super_phy, "Archaeplastida", "SPOT 890m", next_color),
#           plot_pies(super_phy, "Archaeplastida", "SPOT 150m", next_color),
#           plot_pies(super_phy, "Archaeplastida", "ALOHA 1000m", next_color),
#           leg,
#           ncol=1, nrow=10, rel_widths = 0.1, rel_heights = 0.01,hjust=-2,vjust=5)
# # dev.off()
# #
# leg<-get_legend(plot_pies(super_phy, "Rhizaria", "SPOT 150m", next_color)+theme(legend.title=element_blank(),legend.position="bottom", legend.direction = "vertical"))
# # svg("Rhizaria_pies.svg",h=15)
# plot_grid(plot_pies(super_phy, "Rhizaria", "Catalina surface", next_color),
#           plot_pies(super_phy, "Rhizaria", "PortofLA surface", next_color),
#           plot_pies(super_phy, "Rhizaria", "SPOT surface", next_color),
#           plot_pies(super_phy, "Rhizaria", "ALOHA DCM", next_color),
#           plot_pies(super_phy, "Rhizaria", "ALOHA 150m", next_color),
#           plot_pies(super_phy, "Rhizaria", "ALOHA surface", next_color),
#           plot_pies(super_phy, "Rhizaria", "SPOT 890m", next_color),
#           plot_pies(super_phy, "Rhizaria", "SPOT 150m", next_color),
#           plot_pies(super_phy, "Rhizaria", "ALOHA 1000m", next_color),
#           leg,
#           ncol=1, nrow=10, rel_widths = 0.1, rel_heights = 0.01,hjust=-2,vjust=5)
In [46]:
## Supplementary barplots with selected taxa at higher resolution:
head(df_wtax_noNA[1:2,])
#
class<-aggregate(df_tax$mean_CPM, by=list(Location=df_tax$Location, Depth=df_tax$Depth, 
                                          sample=df_tax$sample, Nextlevel=df_tax$Nextlevel, 
                                          Class=df_tax$Class), sum)

class_lev<-c("Dinoflagellate","Ciliate","Bacillariophyceae","Chlorophyta")
class_sub<-subset(class, Nextlevel %in% class_lev)
A data.frame: 2 × 13
LocationDepthTaxonomyKOmean_CPMSupergroupPhylumClassOrderFamilyGenusSpeciessample
<chr><chr><fct><fct><dbl><chr><chr><chr><chr><chr><chr><chr><chr>
1ALOHA1000mAlveolate;K000020AlveolateALOHA 1000m
2ALOHA1000mAlveolate;K000030AlveolateALOHA 1000m
In [56]:
# Ciliate plot
cil<-subset(class_sub, Nextlevel %in% "Ciliate")
sample_list<-c("Catalina surface", "PortofLA surface", "SPOT surface", "ALOHA DCM", "ALOHA 150m", "ALOHA surface", "SPOT 150m", "ALOHA 1000m", "SPOT 890m")
cil$sample_order<-factor(cil$sample, levels=rev(sample_list))
#
plot_ciliate<-ggplot(cil,aes(y=x,x=sample_order,fill=Class))+
  geom_bar(stat="identity", position="fill", color="white")+labs(title="", x="",y="Relative abundance CPM")+
  scale_x_discrete(limits=c(), expand = c(0, 0))+
  # scale_fill_brewer(palette = "PuRd")+
  coord_flip()+
  scale_y_continuous(expand=c(0,0))+
  theme(legend.title=element_blank(),legend.position="bottom",legend.text.align=0, axis.text = element_text(color="black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),panel.border = element_blank(), axis.line = element_line())+
  guides(fill=guide_legend(reverse=TRUE))
# plot_ciliate+scale_fill_brewer(palette = "Set3")
In [57]:
# non-ciliate groups go to the Order level
order<-aggregate(df_tax$mean_CPM, by=list(Location=df_tax$Location, Depth=df_tax$Depth, 
                                          sample=df_tax$sample, Nextlevel=df_tax$Nextlevel,
                                          Order=df_tax$Order), sum)
order_sub<-subset(order, Nextlevel %in% class_lev)
unique(order_sub$Order)
noncil<-c("Dinoflagellate","Bacillariophyceae","Chlorophyta")
# head(order_sub)
# Plot others by order:
order_sub$sample_order<-factor(order_sub$sample, levels=rev(sample_list))
#
plot_order<-ggplot(order_sub,aes(y=x,x=sample_order,fill=Order))+
  geom_bar(stat="identity", position="fill", color="white")+labs(title="", x="",y="Relative abundance CPM")+
  scale_x_discrete(limits=c(), expand = c(0, 0))+
  guides(fill=guide_legend(reverse=TRUE))+
  coord_flip()+
  scale_y_continuous(expand=c(0,0))+
  theme(legend.title=element_blank(),legend.position="bottom",legend.text.align=0, 
        axis.text = element_text(color="black"),panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),panel.background = element_blank(),
        panel.border = element_blank(), axis.line = element_line())
  1. ''
  2. 'Bacillariales'
  3. 'Chlamydomonadales'
  4. 'Chlorodendrales'
  5. 'Choreotrichida'
  6. 'Corethrids'
  7. 'Coscinodiscids'
  8. 'Cyclophorales'
  9. 'Cyclotrichiida'
  10. 'Cymatosirales'
  11. 'Cyrtolophosidida'
  12. 'Dinophysiales'
  13. 'Dolichomastigales'
  14. 'Euplotida'
  15. 'Gonyaulacales'
  16. 'Gymnodiniales'
  17. 'Hemiaulales'
  18. 'Heterotrichia'
  19. 'Incertae sedis'
  20. 'Licmophorales'
  21. 'Lithodesmiales'
  22. 'Mamiellales'
  23. 'Melosirids'
  24. 'Naviculales'
  25. 'Nephroselmidales'
  26. 'Noctilucales'
  27. 'Oxyrrhinales'
  28. 'Peniculida'
  29. 'Peridiniales'
  30. 'Philasterida'
  31. 'Picocystales'
  32. 'Pleurostomatida'
  33. 'Prasinococcales'
  34. 'Prasiola'
  35. 'Prorocentrales'
  36. 'Prorodontida'
  37. 'Protocruziida'
  38. 'Pycnococcaceae'
  39. 'Pyramimonadales'
  40. 'Rhaphoneidales'
  41. 'Rhizosoleniales'
  42. 'Sporadotrichida'
  43. 'Straitellales'
  44. 'Suessiales'
  45. 'Surirellales'
  46. 'Tetrahymenida'
  47. 'Tetrasporales'
  48. 'Thalassionematales'
  49. 'Thalassiophysales'
  50. 'Thalassiosirales'
  51. 'Tintinnida'
  52. 'Triceratiales'
  53. 'Urostylida'
In [53]:
# plot_order %+% subset(order_sub, Nextlevel %in% "Dinoflagellate")+scale_fill_brewer(palette = "Set3")
# #
# plot_order %+% subset(order_sub, Nextlevel %in% "Bacillariophyceae")
# #
# plot_order %+% subset(order_sub, Nextlevel %in% "Chlorophyta")+scale_fill_brewer(palette = "Set3")
In [63]:
options(repr.plot.width = 10, repr.plot.height = 7)

# svg("Suppl_tax_res.svg", h=11,w=17)
plot_grid(plot_order %+% subset(order_sub, Nextlevel %in% "Dinoflagellate")+scale_fill_brewer(palette = "Set3"),
          plot_ciliate+scale_fill_brewer(palette = "Set3"),
          plot_order %+% subset(order_sub, Nextlevel %in% "Chlorophyta")+scale_fill_brewer(palette = "Set3"),
          plot_order %+% subset(order_sub, Nextlevel %in% "Bacillariophyceae"),
          labels=c('a. Dinoflagellates', 'b. Ciliates', 'c. Chlorophytes', 'd. Diatoms'),
         align = "hv")
# dev.off()
Warning message in RColorBrewer::brewer.pal(n, pal):
“n too large, allowed maximum for palette Set3 is 12
Returning the palette you asked for with that many colors
”

Import kegg identities and additional annotation information to reformat normalized count data

In [65]:
load("Normed_avg_annotated_08022018.RData", verbose=T) #available from Zenodo
Loading objects:
  avg_CPM
  df_wtax
  df_wKO_wdups
In [66]:
load("ReNorm_bytax_08022018.RData", verbose=T) #available from Zenodo
Loading objects:
  dfnorm_Dinoflagellate
  dfnorm_Ciliate
  dfnorm_Haptophyta
  dfnorm_Bacillariophyceae
  dfnorm_Chlorophyta
  dfnorm_Pelagophyceae
  dfnorm_MAST
  dfnorm_Rhizaria
  comboTax
In [229]:
# import Kegg list information:

# Derived from available Kegg information
keggs <- read.delim("compile-raw-data/K0_geneIDs_fulllist_08192019.txt",sep="\t",fill=T)

# Curated kegg identities to hone in on specific protistan nutritional modes
custom<-read.delim("Custom_KO_list_30-04-2020.txt",header=TRUE, sep="\t")

tax_ref<-read.csv("taxonomic-assign-reference.txt",row.names=NULL)
Warning message in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
“EOF within quoted string”
In [230]:
# Join whole community data:
wholecomm_wK0<-left_join(avg_CPM, keggs,by="KO")
whole_wcat <- left_join(wholecomm_wK0, custom, by="KO")
# head(whole_wcat)

# Option to save and use R object
save(whole_wcat, file="NormedbyWhole_annotated.RData")
A data.frame: 6 × 18
LocationDepthTaxonomyKOmean_CPMGene_identificationABCDTargetTarget_2CategoryGeneIDFull_GeneNamePhagotrophy_annotationNotesX
<chr><chr><fct><fct><dbl><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct>
1ALOHA1000mAlveolate;K000020AKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2] Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANANANANA
2ALOHA1000mAlveolate;K000030E1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismSerine and threonine metabolism Threonine biosynthesis, aspartate => homoserine => threonine NANANANANANANANA
3ALOHA1000mAlveolate;K000030E1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismCysteine and methionine metabolismMethionine biosynthesis, apartate => homoserine => methionineNANANANANANANANA
4ALOHA1000mAlveolate;K000060GPD1; glycerol-3-phosphate dehydrogenase (NAD+) [EC:1.1.1.8] NA NA NA NA NANANANANANANANA
5ALOHA1000mAlveolate;K000080SORD, gutB; L-iditol 2-dehydrogenase [EC:1.1.1.14] Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANANANANA
6ALOHA1000mAlveolate;K000100iolG; myo-inositol 2-dehydrogenase / D-chiro-inositol 1-dehydrogenase [EC:1.1.1.18 1.1.1.369]NA NA NA NA NANANANANANANANA
In [251]:
# Join with data that is normalized by individual taxonomic groups
combo_tax_wK0<-left_join(comboTax, keggs,by="KO")

# tax_wcat - KO IDs with taxonomic ID
tax_wcat<-left_join(combo_tax_wK0, custom, by="KO")

# Option to save
# save(tax_wcat, file="Normedbytax_annotated.RData")

UpSetR plot - distribution of taxonomic levels and composition

In [64]:
library(UpSetR)
Attaching package: ‘UpSetR’


The following object is masked from ‘package:lattice’:

    histogram


In [71]:
melt_wKO<-melt(df_wKO_wdups)
melt_wKO$variable<-NULL
# head(melt_wKO)
melt_wKO_noNA<-subset(melt_wKO, !(Taxonomy %in% "Not assigned" | KO %in% "Not assigned" | value == 0))
# head(melt_wKO_noNA) 
# removed zeros and unassigned, KO IDs are duplicated for all applicable modules.
tmp<-melt_wKO_noNA[c(1:5,10)]
melt_wKO_noNA_noDup<-tmp[!duplicated(tmp),]
# head(melt_wKO_noNA_noDup)
toUpset<-melt_wKO_noNA_noDup[c(1:2,3:4,6)]
head(toUpset) # Use below to add annotation
Using Location, Depth, Taxonomy, KO, Gene_identification, A, B, C, D as id variables

A data.frame: 6 × 5
LocationDepthTaxonomyKOvalue
<chr><chr><fct><fct><dbl>
7ALOHA1000mAlveolate;K000110.01522523
8ALOHA1000mAlveolate;K000120.03045045
30ALOHA1000mAlveolate;K000250.03045045
35ALOHA1000mAlveolate;K000260.03045045
41ALOHA1000mAlveolate;K000294.33418666
45ALOHA1000mAlveolate;K000310.16427429
In [74]:
# Format for UpSetR plots:
toUpset$Uniq<-paste(toUpset$Taxonomy, toUpset$KO, sep="_")
toUpset$sample<-paste(toUpset$Location, toUpset$Depth, sep="_")
# Change to binary
summed<-aggregate(toUpset$value, by=list(sample=toUpset$sample, Uniq=toUpset$Uniq),sum)
summed$bin<-ifelse(summed$x > 0, 1, 0)
binary_wide<-dcast(summed[c(1,2,4)], Uniq~sample, fill=0)
row.names(binary_wide)<-binary_wide$Uniq; binary_wide$Uniq<-NULL
head(binary_wide)
In [81]:
options(repr.plot.width = 16, repr.plot.height = 7)
upset(binary_wide, keep.order = T, group.by=c("both"), nsets=13, number.angles = 45, text.scale=2, intersections=list(list("ALOHA_surface","Catalina_surface","PortofLA_surface","SPOT_150m","SPOT_890m","SPOT_surface","ALOHA_150m","ALOHA_DCM","ALOHA_1000m"),list("ALOHA_surface","ALOHA_150m","ALOHA_DCM","ALOHA_1000m"),list("ALOHA_surface","ALOHA_150m","ALOHA_DCM"),list("ALOHA_150m","ALOHA_DCM"),list("ALOHA_surface","ALOHA_DCM"),list("Catalina_surface","PortofLA_surface","SPOT_150m","SPOT_890m","SPOT_surface"),list("Catalina_surface","PortofLA_surface","SPOT_surface"),list("Catalina_surface","PortofLA_surface"),list("SPOT_150m","SPOT_890m","SPOT_surface"),list("SPOT_150m","SPOT_890m"),list("ALOHA_surface","Catalina_surface","PortofLA_surface","SPOT_surface"),list("ALOHA_surface","Catalina_surface","PortofLA_surface","SPOT_surface","ALOHA_150m","ALOHA_DCM"),list("SPOT_150m","SPOT_890m","ALOHA_1000m"),list("SPOT_150m","SPOT_890m","ALOHA_150m","ALOHA_DCM","ALOHA_1000m"),list("ALOHA_surface"),list("Catalina_surface"),list("PortofLA_surface"),list("SPOT_150m"),list("SPOT_890m"),list("SPOT_surface"),list("ALOHA_150m"),list("ALOHA_DCM"),list("ALOHA_1000m")))
In [ ]:
# Dissect binary wide further - color by specific categories:
#
tax <- read.delim("taxonomic-assign-reference.txt")
# head(tax)
# head(binary_wide[1:2,])
binary_wide$annot<-row.names(binary_wide) # pull out row names as a category to annotate
tmp<-colsplit(binary_wide$annot, "_", c("Taxonomy", "KO"));head(tmp)
tmp_wtax<-left_join(tmp, tax2, by="Taxonomy")
In [84]:
df<-binary_wide[1:9]
# head(df)
df$Intersect <- apply(df > 0, 1, function(x){toString(names(df)[x])})
head(df[1:3,])
A data.frame: 6 × 9
ALOHA_1000mALOHA_150mALOHA_DCMALOHA_surfaceCatalina_surfacePortofLA_surfaceSPOT_150mSPOT_890mSPOT_surface
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
Alveolate;_K00002000111111
Alveolate;_K00003011100111
Alveolate;_K00006000101111
Alveolate;_K00008011111111
Alveolate;_K00010010001001
Alveolate;_K00011111111111
A data.frame: 3 × 10
ALOHA_1000mALOHA_150mALOHA_DCMALOHA_surfaceCatalina_surfacePortofLA_surfaceSPOT_150mSPOT_890mSPOT_surfaceIntersect
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr>
Alveolate;_K00002000111111ALOHA_surface, Catalina_surface, PortofLA_surface, SPOT_150m, SPOT_890m, SPOT_surface
Alveolate;_K00003011100111ALOHA_150m, ALOHA_DCM, ALOHA_surface, SPOT_150m, SPOT_890m, SPOT_surface
Alveolate;_K00006000101111ALOHA_surface, PortofLA_surface, SPOT_150m, SPOT_890m, SPOT_surface
In [119]:
binary_tax <- df %>%
    rownames_to_column(var = "uniq") %>% 
    separate(uniq, c("Taxonomy", "KEGGID"), sep = "_", remove = FALSE) %>% 
    inner_join(tax, by = "Taxonomy") %>% 
    column_to_rownames(var = "uniq") %>% 
    data.frame
head(binary_tax[1:2,])
unique(binary_tax$tax_compiled)
A data.frame: 2 × 14
TaxonomyKEGGIDALOHA_1000mALOHA_150mALOHA_DCMALOHA_surfaceCatalina_surfacePortofLA_surfaceSPOT_150mSPOT_890mSPOT_surfaceIntersectNextleveltax_compiled
<chr><chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr><fct><fct>
Alveolate;_K00002Alveolate;K00002000111111ALOHA_surface, Catalina_surface, PortofLA_surface, SPOT_150m, SPOT_890m, SPOT_surfaceOtherOther Alveolate
Alveolate;_K00003Alveolate;K00003011100111ALOHA_150m, ALOHA_DCM, ALOHA_surface, SPOT_150m, SPOT_890m, SPOT_surface OtherOther Alveolate
  1. Other Alveolate
  2. Ciliate
  3. Dinoflagellate
  4. Syndiniales
  5. Other
  6. Other Archaeplastida
  7. Chlorophyta
  8. Haptophytes
  9. Opisthokont
  10. Rhizaria
  11. Other Stramenopile
  12. MAST
  13. Diatom
  14. Pelagophytes
Levels:
  1. 'Chlorophyta'
  2. 'Ciliate'
  3. 'Diatom'
  4. 'Dinoflagellate'
  5. 'Haptophytes'
  6. 'MAST'
  7. 'Opisthokont'
  8. 'Other'
  9. 'Other Alveolate'
  10. 'Other Archaeplastida'
  11. 'Other Stramenopile'
  12. 'Pelagophytes'
  13. 'Rhizaria'
  14. 'Syndiniales'
In [120]:
# Factor plot
intersect_order<-c("ALOHA_1000m, ALOHA_150m, ALOHA_DCM, ALOHA_surface, Catalina_surface, PortofLA_surface, SPOT_150m, SPOT_890m, SPOT_surface","ALOHA_1000m, ALOHA_150m, ALOHA_DCM, ALOHA_surface","ALOHA_150m, ALOHA_DCM, ALOHA_surface","ALOHA_150m, ALOHA_DCM","ALOHA_DCM, ALOHA_surface","Catalina_surface, PortofLA_surface, SPOT_150m, SPOT_890m, SPOT_surface","Catalina_surface, PortofLA_surface, SPOT_surface","Catalina_surface, PortofLA_surface","SPOT_150m, SPOT_890m, SPOT_surface","SPOT_150m, SPOT_890m","ALOHA_surface, Catalina_surface, PortofLA_surface, SPOT_surface","ALOHA_150m, ALOHA_DCM, ALOHA_surface, Catalina_surface, PortofLA_surface, SPOT_surface","ALOHA_1000m, SPOT_150m, SPOT_890m","ALOHA_1000m, ALOHA_150m, ALOHA_DCM, SPOT_150m, SPOT_890m","ALOHA_surface","Catalina_surface","PortofLA_surface","SPOT_150m","SPOT_890m","SPOT_surface","ALOHA_150m","ALOHA_DCM","ALOHA_1000m")
binary_tax$order_x<-factor(binary_tax$Intersect, levels = intersect_order) 
tax_order<-c("Dinoflagellate","Ciliate","Syndiniales","Other Alveolate","MAST","Diatom","Pelagophytes","Other Stramenopile","Chlorophyta","Other Archaeplastida","Haptophytes","Rhizaria","Opisthokont", "Other")
tax_order_label<-c("Dinoflagellates","Ciliates","Syndiniales","Other Alveolates","MAST","Diatoms","Pelagophytes","Other Stramenopile","Chlorophytes","Other Archaeplastid","Haptophytes","Rhizaria","Opisthokonts", "Other")
tax_order_color<-c("#612741","#b74a70","#b7757c","#eecfbf","#92462f","#bb603c","#dfa837","#ccc050","#33431e","#93b778","#61ac86","#657abb","#1c1949","#8a8d84")
binary_tax$TAX_ORDER<-factor(binary_tax$tax_compiled, levels = (tax_order), labels = (tax_order_label))
names(tax_order_color)<-(tax_order_label)
In [121]:
plot_tax_bin<-ggplot(binary_tax, aes(x=order_x, fill=TAX_ORDER))+
  geom_bar(stat = "count", position="stack")+
  theme(axis.text.x = element_text(angle=45, size=0.8))+
  scale_x_discrete(limits=c(), expand = c(0, 0))+
  scale_fill_manual(values=(tax_order_color))+
  scale_y_continuous(position = "top", expand=c(0,0))
#
options(repr.plot.width = 16, repr.plot.height = 7)
plot_tax_bin %+% subset(binary_tax, Intersect %in% intersect_order) #w:1080 h:650
Warning message:
“Position guide is perpendicular to the intended axis. Did you mean to specify a different guide `position`?”
Warning message:
“guide_axis(): Discarding guide on merge. Do you have more than one guide with the same position?”
Warning message:
“guide_axis(): Discarding guide on merge. Do you have more than one guide with the same position?”
Warning message:
“guide_axis(): Discarding guide on merge. Do you have more than one guide with the same position?”
Warning message:
“guide_axis(): Discarding guide on merge. Do you have more than one guide with the same position?”

Ordination analysis

In [122]:
custom_paths <- read.delim("Custom_KO_list_11212019.txt", header = TRUE)
load("Normedbytax_annotated.RData", verbose = T)
tax_wcat$Category<- NULL
tax_wcat_subset <- inner_join(tax_wcat, custom_paths, by="KO")
# head(tax_wcat_subset)
Loading objects:
  tax_wcat
In [126]:
# Write function to subset specific taxa
# library(tidyverse)
generate_pcoa <- function(df, tax) {
  tax_wide <- df %>%
    filter(taxa == tax) %>%
    group_by(sample, KO) %>%
    summarise(MEAN = mean(mean_count)) %>%
    pivot_wider(names_from = KO, values_from = MEAN) %>%
    data.frame
  row.names(tax_wide) <- tax_wide$sample
  tax_wide$sample <- NULL
  jac_bytax <- vegdist(tax_wide, method = "jaccard")
  pcoa_bytax <- princomp(jac_bytax)
  plot(pcoa_bytax)
  return(pcoa_bytax)
}
##
##
# Function to plot output PCOA
plot_pcoa <- function(pcoa_in, tax) {
  tmp <- data.frame(pcoa_in$scores)
  tmp$SAMPLE <- row.names(tmp)
  df <- separate(tmp, col = SAMPLE, c("Location", "Depth"), sep = "_", remove = FALSE)
  df$TAXA <- tax
  eigenvalues<-round(pcoa_in$sdev, 4)*100
  # Factoring
  sample_list<-c("Catalina_surface", "PortofLA_surface", "SPOT_surface", "ALOHA_surface","ALOHA_DCM", "ALOHA_150m", "SPOT_150m","SPOT_890m","ALOHA_1000m")
  df$SAMPLE_ORDER<-factor(df$SAMPLE, levels=sample_list)
  sample_alpha <- c(1,1,1,0.5,0.5,0.5,1,1,0.5)
  depth_order <- c("surface", "DCM", "150m", "890m", "1000m")
  df$DEPTH_ORDER <-factor(df$Depth, levels = depth_order)
  DEPTH_SHAPE <- c(24,25,21,22,23)
  loc_order <-c("ALOHA", "Catalina", "PortofLA", "SPOT")
  df$LOC_ORDER <- factor(df$Location, levels = loc_order)
  LOC_COL <-c("#c51b7d","#fee08b","#1a9850","#4575b4")
  names(LOC_COL)<-loc_order
  tax_order<-c("Dinoflagellate","Ciliate","MAST","Bacillariophyceae","Pelagophyceae","Chlorophyta","Haptophyta","Rhizaria")
  df$TAX_ORDER <- factor(df$TAXA, levels= tax_order)
  TAX_COL<-c("#612741","#b74a70","#92462f","#bb603c","#dfa837","#33431e","#61ac86","#657abb")
  names(TAX_COL)<-tax_order
  ggplot(df, aes(x=Comp.1, y=Comp.2, fill=TAX_ORDER, shape=DEPTH_ORDER)) +
    geom_hline(yintercept=0, color="black")+
    geom_vline(xintercept=0, color="black")+
    geom_point(stat="identity", color="black", size=4, aes(alpha = SAMPLE_ORDER))+
    scale_fill_manual(values = TAX_COL)+
    scale_alpha_manual(values = sample_alpha)+
    scale_shape_manual(values = DEPTH_SHAPE)+
    labs(x=eigenvalues[1], y=eigenvalues[2], title = tax)+
    theme_minimal()+
    theme(axis.text = element_text(color="black", face="bold"),
          panel.grid.major = element_line(color="grey"))+
    guides(fill = guide_legend(override.aes = list(shape = 21)),
           shape = guide_legend(override.aes = list(fill = "black")))
    # scale_x_continuous(limits = c(-2, 2))+
    # scale_y_continuous(limits = c(-1.2, 1.2))
}
unique(tax_wcat_subset$taxa)
  1. 'Dinoflagellate'
  2. 'Ciliate'
  3. 'Haptophyta'
  4. 'Bacillariophyceae'
  5. 'Chlorophyta'
  6. 'Pelagophyceae'
  7. 'MAST'
  8. 'Rhizaria'
In [128]:
options(repr.plot.width = 7, repr.plot.height = 6)
pc_dinos <- generate_pcoa(tax_wcat_subset, "Dinoflagellate")
pc_ciliate <- generate_pcoa(tax_wcat_subset, "Ciliate")
pc_mast <- generate_pcoa(tax_wcat_subset, "MAST")
pc_rhiz <- generate_pcoa(tax_wcat_subset, "Rhizaria")
pc_hapto <- generate_pcoa(tax_wcat_subset, "Haptophyta")
plot_pcoa(pc_dinos, "Dinoflagellate")
plot_pcoa(pc_ciliate, "Ciliate")
plot_pcoa(pc_mast, "MAST")
plot_pcoa(pc_rhiz, "Rhizaria")
plot_pcoa(pc_hapto, "Haptophyta")
`summarise()` regrouping output by 'sample' (override with `.groups` argument)

`summarise()` regrouping output by 'sample' (override with `.groups` argument)

`summarise()` regrouping output by 'sample' (override with `.groups` argument)

`summarise()` regrouping output by 'sample' (override with `.groups` argument)

`summarise()` regrouping output by 'sample' (override with `.groups` argument)

In [130]:
load("NormedbyWhole_annotated.RData", verbose = T)
tax <- read.delim("taxonomic-assign-reference.txt")
Loading objects:
  whole_wcat
In [132]:
head(whole_wcat)
head(tax)
A data.frame: 6 × 15
LocationDepthTaxonomyKOmean_CPMGene_identificationABCDTargetCategoryGeneIDFull_GeneNameNotes
<chr><chr><fct><chr><dbl><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct>
1ALOHA1000mAlveolate;K000020AKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2] Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANA
2ALOHA1000mAlveolate;K000030E1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismSerine and threonine metabolism Threonine biosynthesis, aspartate => homoserine => threonine NANANANANA
3ALOHA1000mAlveolate;K000030E1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismCysteine and methionine metabolismMethionine biosynthesis, apartate => homoserine => methionineNANANANANA
4ALOHA1000mAlveolate;K000060GPD1; glycerol-3-phosphate dehydrogenase (NAD+) [EC:1.1.1.8] NA NA NA NA NANANANANA
5ALOHA1000mAlveolate;K000080SORD, gutB; L-iditol 2-dehydrogenase [EC:1.1.1.14] Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANA
6ALOHA1000mAlveolate;K000100iolG; myo-inositol 2-dehydrogenase / D-chiro-inositol 1-dehydrogenase [EC:1.1.1.18 1.1.1.369]NA NA NA NA NANANANANA
A data.frame: 6 × 3
TaxonomyNextleveltax_compiled
<fct><fct><fct>
1Alveolate; Other Other Alveolate
2Alveolate;Apicomplexa; ApicomplexaOther Alveolate
3Alveolate;Apicomplexa;Chromerida; ApicomplexaOther Alveolate
4Alveolate;Apicomplexa;Chromerida;Chromeraceae; ApicomplexaOther Alveolate
5Alveolate;Apicomplexa;Chromerida;Chromeraceae;Incertae sedis; ApicomplexaOther Alveolate
6Alveolate;Apicomplexa;Chromerida;Chromeraceae;Incertae sedis;Chromera;ApicomplexaOther Alveolate
In [505]:
head(whole_wcat)
A data.frame: 6 × 18
LocationDepthTaxonomyKOmean_CPMGene_identificationABCDTargetTarget_2CategoryGeneIDFull_GeneNamePhagotrophy_annotationNotesX
<chr><chr><fct><fct><dbl><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct>
1ALOHA1000mAlveolate;K000020AKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2] Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANANANANA
2ALOHA1000mAlveolate;K000030E1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismSerine and threonine metabolism Threonine biosynthesis, aspartate => homoserine => threonine NANANANANANANANA
3ALOHA1000mAlveolate;K000030E1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismCysteine and methionine metabolismMethionine biosynthesis, apartate => homoserine => methionineNANANANANANANANA
4ALOHA1000mAlveolate;K000060GPD1; glycerol-3-phosphate dehydrogenase (NAD+) [EC:1.1.1.8] NA NA NA NA NANANANANANANANA
5ALOHA1000mAlveolate;K000080SORD, gutB; L-iditol 2-dehydrogenase [EC:1.1.1.14] Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANANANANA
6ALOHA1000mAlveolate;K000100iolG; myo-inositol 2-dehydrogenase / D-chiro-inositol 1-dehydrogenase [EC:1.1.1.18 1.1.1.369]NA NA NA NA NANANANANANANANA
In [506]:
unigene_prof <- whole_wcat %>% 
    unite(SAMPLE, Location, Depth, sep = "_", remove = FALSE) %>% 
    unite(KO_TAX, KO, Taxonomy, sep = "_", remove = FALSE) %>% 
    # If grouping at curated taxonomic level, import tax ref and use to group.
#     left_join(tax, by = "Taxonomy") %>% 
#     filter(!is.na(tax_compiled)) %>% 
    select(SAMPLE, KO_TAX, mean_CPM) %>% 
    distinct() %>% 
    filter(mean_CPM > 0) %>% 
    pivot_wider(id_cols = SAMPLE, names_from = KO_TAX, values_from = mean_CPM, 
                values_fill = list(mean_CPM = 0),
                values_fn = sum) %>% 
    column_to_rownames(var = "SAMPLE") %>% 
    data.frame
head(unigene_prof[1:5])
A data.frame: 6 × 5
K00011_Alveolate.K00012_Alveolate.K00025_Alveolate.K00026_Alveolate.K00029_Alveolate.
<dbl><dbl><dbl><dbl><dbl>
ALOHA_1000m0.015225230.030450450.030450450.030450454.3341867
ALOHA_150m0.022335543.824104660.011167771.105609450.3706372
ALOHA_DCM0.271545040.988321140.216577250.645211621.7209336
ALOHA_surface0.734785205.920852751.298426341.489875451.2622815
Catalina_surface0.198783771.311143280.305338581.536026389.8580289
PortofLA_surface0.010424711.991970400.369076740.905167546.4029598
In [507]:
# Jaccard dist metric:
library(vegan)
jac<-vegdist(unigene_prof, method="jaccard")
pcoa<-princomp(jac)
plot(pcoa)
In [508]:
tmp<-data.frame(pcoa$scores)
tmp$SAMPLE<-row.names(tmp)
out<-colsplit(tmp$SAMPLE, "_", c("Loc", "Depth"))
pcoa_all_df<-data.frame(tmp, out)
eigenvalues<-round(pcoa$sdev, 4)*100
unique(pcoa_all_df$Depth)
# Factoring
depth_order <- c("surface", "DCM", "150m", "890m", "1000m")
pcoa_all_df$DEPTH_ORDER <-factor(pcoa_all_df$Depth, levels = depth_order)
DEPTH_COLOR <- c("#f768a1","#fe9929","#41ab5d", "#084594", "#084594")
names(DEPTH_COLOR)<-depth_order
# DEPTH_SHAPE <- c(24,25,21,22,23)
#
loc_order <-c("ALOHA", "Catalina", "PortofLA", "SPOT")
pcoa_all_df$LOC_ORDER <- factor(pcoa_all_df$Loc, levels = loc_order)
# LOC_COL <-c("#c51b7d","#fee08b","#1a9850","#4575b4")
LOC_SHAPE <- c(5, 18, 15, 19)
names(LOC_SHAPE)<-loc_order
# loc_alpha <- c(0.5, 1, 1, 1)
  1. '1000m'
  2. '150m'
  3. 'DCM'
  4. 'surface'
  5. '890m'
In [512]:
byloc_PCA <- ggplot(pcoa_all_df, aes(x=Comp.1, y=Comp.2, fill=DEPTH_ORDER, shape=LOC_ORDER)) +
  geom_hline(yintercept=0, color="#525252")+
  geom_vline(xintercept=0, color="#525252")+
  geom_point(stat="identity", size=6, aes(fill=DEPTH_ORDER, color = DEPTH_ORDER, shape=LOC_ORDER))+
  scale_fill_manual(values = DEPTH_COLOR)+
  scale_color_manual(values = DEPTH_COLOR)+
  scale_shape_manual(values = LOC_SHAPE)+
  labs(x=paste(eigenvalues[1], "%"), y=paste(eigenvalues[2], "%"))+
  theme_minimal()+
  theme(axis.text = element_text(color="black", face="bold", size = 12),
        axis.title=element_text(size=14,face="bold"),
        panel.grid.major = element_line(color="grey"),
        legend.title = element_blank())+
#   scale_x_continuous(limits = c(-0.5, 0.75))+
#   scale_y_continuous(limits = c(-0.5, 0.25))+
  guides(fill = guide_legend(override.aes=list(shape=22)))
In [522]:
options(repr.plot.width = 6, repr.plot.height = 5)
# svg("pca-metaT-all.svg", w = 6, h = 5)
byloc_PCA
# dev.off()
pdf: 2
In [148]:
library(plotly)
Attaching package: ‘plotly’


The following object is masked from ‘package:ggplot2’:

    last_plot


The following object is masked from ‘package:stats’:

    filter


The following object is masked from ‘package:graphics’:

    layout


In [518]:
save(pcoa_all_df, eigenvalues, pcoa, file = "pcoa-input-2020.RData")
In [516]:
pca_3d <- plot_ly(pcoa_all_df,x=~Comp.1,y=~Comp.2,z=~Comp.3,symbol=~Loc, color=~Depth,
      colors=c("#084594","#41ab5d","#084594","#fe9929","#f768a1"), symbols = c(5,18,15,19))%>%
  layout(title='PCA unigene profiles - ALOHA & CA',
         scene=list(xaxis=list(title=paste0('PC1 ',eigenvalues[1],'%'),
                               scale=pcoa$sdev[1]),
                    yaxis=list(title=paste0('PC2 ',eigenvalues[2],'%'),
                               scale=pcoa$sdev[2]),
                    zaxis=list(title=paste0('PC3 ',eigenvalues[3],'%'),
                               scale=pcoa$sdev[3])))
In [517]:
options(repr.plot.width = 6, repr.plot.height = 20)
pca_3d
No trace type specified:
  Based on info supplied, a 'scatter3d' trace seems appropriate.
  Read more about this trace type -> https://plot.ly/r/reference/#scatter3d

No scatter3d mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

No trace type specified:
  Based on info supplied, a 'scatter3d' trace seems appropriate.
  Read more about this trace type -> https://plot.ly/r/reference/#scatter3d

No scatter3d mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

Heatmaps

In [232]:
# load("NormedbyWhole_annotated.RData", verbose=T)
head(whole_wcat[1:2,])
A data.frame: 2 × 18
LocationDepthTaxonomyKOmean_CPMGene_identificationABCDTargetTarget_2CategoryGeneIDFull_GeneNamePhagotrophy_annotationNotesX
<chr><chr><fct><fct><dbl><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct>
1ALOHA1000mAlveolate;K000020AKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2]Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANANANANA
2ALOHA1000mAlveolate;K000030E1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismSerine and threonine metabolismThreonine biosynthesis, aspartate => homoserine => threonineNANANANANANANANA
In [240]:
unique(whole_wcat$Target) #Upper level categories
unique(whole_wcat$Target_2) 
unique(whole_wcat$Category) # Use for categories to plot
  1. <NA>
  2. Phagotrophy
  3. Phagotrophy-other
  4. Energy Acquisition
  5. C metabolism
  6. Nutrient processing
Levels:
  1. 'C metabolism'
  2. 'Energy Acquisition'
  3. 'Nutrient processing'
  4. 'Phagotrophy'
  5. 'Phagotrophy-other'
  1. <NA>
  2. Fatty acid breakdown
  3. other
  4. Fatty acid biosynthesis
  5. Glyoxylate cycle
  6. TCA cycle
  7. Calvin cycle
  8. Gluconeogenesis-Glycolysis
  9. PDH
  10. GS/GOGAT
  11. Inorganic N uptake and assimilation
  12. Nitrate reduction (assimilatory)
  13. Organic N uptake and assimilation
  14. Urea cycle
  15. Chitinase
  16. Endocytosis
  17. Phagosome associated
  18. Motility and prey recognition
  19. P metabolism
  20. Lysosome associated
  21. V-type ATPase
  22. NRT
  23. Photosynthesis
  24. AMT
  25. SNARE complex
  26. Actin polymerization
Levels:
  1. 'Actin polymerization'
  2. 'AMT'
  3. 'Calvin cycle'
  4. 'Chitinase'
  5. 'Endocytosis'
  6. 'Fatty acid biosynthesis'
  7. 'Fatty acid breakdown'
  8. 'Gluconeogenesis-Glycolysis'
  9. 'Glyoxylate cycle'
  10. 'GS/GOGAT'
  11. 'Inorganic N uptake and assimilation'
  12. 'Lysosome associated'
  13. 'Motility and prey recognition'
  14. 'Nitrate reduction (assimilatory)'
  15. 'NRT'
  16. 'Organic N uptake and assimilation'
  17. 'other'
  18. 'P metabolism'
  19. 'PDH'
  20. 'Phagosome associated'
  21. 'Photosynthesis'
  22. 'SNARE complex'
  23. 'TCA cycle'
  24. 'Urea cycle'
  25. 'V-type ATPase'
  1. <NA>
  2. Fatty acid breakdown
  3. other
  4. Fatty acid biosynthesis
  5. Glyoxylate cycle
  6. TCA cycle
  7. additional breakdown
  8. Calvin cycle
  9. Gluconeogenesis
  10. Glycolysis
  11. PDH
  12. GS/GOGAT
  13. Inorganic N uptake and assimilation
  14. Nitrate reduction (assimilatory)
  15. Organic N uptake and assimilation
  16. Urea cycle
  17. Triacylglycerol biosynthesis
  18. Chitinase
  19. phosphatidylinositol
  20. Endocytosis
  21. Phagosome maturation
  22. Motility and prey recognition
  23. P metabolism
  24. Lysosome binding and processing
  25. V-type ATPase
  26. NRT
  27. Photosynthesis
  28. AMT
  29. por
  30. SNARE complex
  31. else
  32. Actin polymerization
Levels:
  1. 'Actin polymerization'
  2. 'additional breakdown'
  3. 'AMT'
  4. 'Calvin cycle'
  5. 'Chitinase'
  6. 'else'
  7. 'Endocytosis'
  8. 'Fatty acid biosynthesis'
  9. 'Fatty acid breakdown'
  10. 'Gluconeogenesis'
  11. 'Glycolysis'
  12. 'Glyoxylate cycle'
  13. 'GS/GOGAT'
  14. 'Inorganic N uptake and assimilation'
  15. 'Lysosome binding and processing'
  16. 'Motility and prey recognition'
  17. 'Nitrate reduction (assimilatory)'
  18. 'NRT'
  19. 'Organic N uptake and assimilation'
  20. 'other'
  21. 'P metabolism'
  22. 'PDH'
  23. 'Phagosome maturation'
  24. 'phosphatidylinositol'
  25. 'Photosynthesis'
  26. 'por'
  27. 'SNARE complex'
  28. 'TCA cycle'
  29. 'Triacylglycerol biosynthesis'
  30. 'Urea cycle'
  31. 'V-type ATPase'
In [241]:
by_category_WHOLE <- whole_wcat %>% 
    unite(SAMPLE, Location, Depth, sep = "_", remove = FALSE) %>% 
    select(SAMPLE, Taxonomy, Target, Category, KO, Gene_identification, mean_CPM) %>% 
    filter(!is.na(Category)) %>% 
    filter(mean_CPM > 0) %>% 
    distinct() %>% 
    group_by(SAMPLE, Taxonomy, Target, Category, KO, Gene_identification) %>%
    summarise(unigene_sum=sum(mean_CPM)) %>%
    na.omit() %>%
    data.frame
head(by_category_WHOLE[1:3,])
unique(by_category_WHOLE$Target)
`summarise()` regrouping output by 'SAMPLE', 'Taxonomy', 'Target', 'Category', 'KO' (override with `.groups` argument)

A data.frame: 3 × 7
SAMPLETaxonomyTargetCategoryKOGene_identificationunigene_sum
<chr><fct><fct><fct><fct><fct><dbl>
1ALOHA_1000mAlveolate;C metabolismCalvin cycle K00134GAPDH, gapA; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]0.48861111
2ALOHA_1000mAlveolate;C metabolismCalvin cycle K00615E2.2.1.1, tktA, tktB; transketolase [EC:2.2.1.1] 0.01522523
3ALOHA_1000mAlveolate;C metabolismGluconeogenesisK00134GAPDH, gapA; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]0.48861111
  1. C metabolism
  2. Energy Acquisition
  3. Nutrient processing
  4. Phagotrophy
  5. Phagotrophy-other
Levels:
  1. 'C metabolism'
  2. 'Energy Acquisition'
  3. 'Nutrient processing'
  4. 'Phagotrophy'
  5. 'Phagotrophy-other'
In [242]:
unique(by_category_WHOLE$Target)
unique(by_category_WHOLE$Category)
  1. C metabolism
  2. Energy Acquisition
  3. Nutrient processing
  4. Phagotrophy
  5. Phagotrophy-other
Levels:
  1. 'C metabolism'
  2. 'Energy Acquisition'
  3. 'Nutrient processing'
  4. 'Phagotrophy'
  5. 'Phagotrophy-other'
  1. Calvin cycle
  2. Gluconeogenesis
  3. Glycolysis
  4. por
  5. Glyoxylate cycle
  6. TCA cycle
  7. AMT
  8. GS/GOGAT
  9. Inorganic N uptake and assimilation
  10. Organic N uptake and assimilation
  11. P metabolism
  12. Urea cycle
  13. Actin polymerization
  14. Chitinase
  15. Endocytosis
  16. Fatty acid biosynthesis
  17. Fatty acid breakdown
  18. Lysosome binding and processing
  19. Motility and prey recognition
  20. Phagosome maturation
  21. SNARE complex
  22. V-type ATPase
  23. additional breakdown
  24. else
  25. phosphatidylinositol
  26. PDH
  27. other
  28. Triacylglycerol biosynthesis
  29. Photosynthesis
  30. Nitrate reduction (assimilatory)
  31. NRT
Levels:
  1. 'Actin polymerization'
  2. 'additional breakdown'
  3. 'AMT'
  4. 'Calvin cycle'
  5. 'Chitinase'
  6. 'else'
  7. 'Endocytosis'
  8. 'Fatty acid biosynthesis'
  9. 'Fatty acid breakdown'
  10. 'Gluconeogenesis'
  11. 'Glycolysis'
  12. 'Glyoxylate cycle'
  13. 'GS/GOGAT'
  14. 'Inorganic N uptake and assimilation'
  15. 'Lysosome binding and processing'
  16. 'Motility and prey recognition'
  17. 'Nitrate reduction (assimilatory)'
  18. 'NRT'
  19. 'Organic N uptake and assimilation'
  20. 'other'
  21. 'P metabolism'
  22. 'PDH'
  23. 'Phagosome maturation'
  24. 'phosphatidylinositol'
  25. 'Photosynthesis'
  26. 'por'
  27. 'SNARE complex'
  28. 'TCA cycle'
  29. 'Triacylglycerol biosynthesis'
  30. 'Urea cycle'
  31. 'V-type ATPase'
In [243]:
# Total KOs
length(unique(by_category_WHOLE$KO))
#
# Subset to relevant categories
unique(by_category_WHOLE$Target)
rm <- c("butyryl", "other", "else", "additional breakdown")
whole_targetsonly <- by_category_WHOLE %>% 
    filter(!(Target == "Phagotrophy-other")) %>% 
    filter(!(Category %in% rm)) %>% 
    data.frame
    
# Sum to show whole community profiles for targetted pathways
whole_targetSUM <- whole_targetsonly %>%
  group_by(SAMPLE, Target, Category) %>%
  summarise(SUM = sum(unigene_sum)) %>%
  data.frame
head(whole_targetSUM)
378
  1. C metabolism
  2. Energy Acquisition
  3. Nutrient processing
  4. Phagotrophy
  5. Phagotrophy-other
Levels:
  1. 'C metabolism'
  2. 'Energy Acquisition'
  3. 'Nutrient processing'
  4. 'Phagotrophy'
  5. 'Phagotrophy-other'
`summarise()` regrouping output by 'SAMPLE', 'Target' (override with `.groups` argument)

A data.frame: 6 × 4
SAMPLETargetCategorySUM
<chr><fct><fct><dbl>
1ALOHA_1000mC metabolism Calvin cycle 960.24948
2ALOHA_1000mC metabolism Gluconeogenesis 1418.79545
3ALOHA_1000mC metabolism Glycolysis 1674.16654
4ALOHA_1000mC metabolism por 481.31378
5ALOHA_1000mEnergy AcquisitionGlyoxylate cycle1276.79267
6ALOHA_1000mEnergy AcquisitionPhotosynthesis 23.89921
In [245]:
unique(whole_targetSUM$Target)
unique(whole_targetSUM$Category)
length(unique(whole_targetSUM$Category))
  1. C metabolism
  2. Energy Acquisition
  3. Nutrient processing
  4. Phagotrophy
Levels:
  1. 'C metabolism'
  2. 'Energy Acquisition'
  3. 'Nutrient processing'
  4. 'Phagotrophy'
  5. 'Phagotrophy-other'
  1. Calvin cycle
  2. Gluconeogenesis
  3. Glycolysis
  4. por
  5. Glyoxylate cycle
  6. Photosynthesis
  7. TCA cycle
  8. AMT
  9. GS/GOGAT
  10. Inorganic N uptake and assimilation
  11. Nitrate reduction (assimilatory)
  12. NRT
  13. Organic N uptake and assimilation
  14. P metabolism
  15. PDH
  16. Urea cycle
  17. Actin polymerization
  18. Chitinase
  19. Endocytosis
  20. Fatty acid biosynthesis
  21. Fatty acid breakdown
  22. Lysosome binding and processing
  23. Motility and prey recognition
  24. Phagosome maturation
  25. SNARE complex
  26. V-type ATPase
Levels:
  1. 'Actin polymerization'
  2. 'additional breakdown'
  3. 'AMT'
  4. 'Calvin cycle'
  5. 'Chitinase'
  6. 'else'
  7. 'Endocytosis'
  8. 'Fatty acid biosynthesis'
  9. 'Fatty acid breakdown'
  10. 'Gluconeogenesis'
  11. 'Glycolysis'
  12. 'Glyoxylate cycle'
  13. 'GS/GOGAT'
  14. 'Inorganic N uptake and assimilation'
  15. 'Lysosome binding and processing'
  16. 'Motility and prey recognition'
  17. 'Nitrate reduction (assimilatory)'
  18. 'NRT'
  19. 'Organic N uptake and assimilation'
  20. 'other'
  21. 'P metabolism'
  22. 'PDH'
  23. 'Phagosome maturation'
  24. 'phosphatidylinositol'
  25. 'Photosynthesis'
  26. 'por'
  27. 'SNARE complex'
  28. 'TCA cycle'
  29. 'Triacylglycerol biosynthesis'
  30. 'Urea cycle'
  31. 'V-type ATPase'
26
In [246]:
options(repr.plot.width = 6, repr.plot.height = 6)
hist(log(whole_targetSUM$SUM)) #Normal distribution
In [247]:
# Format for pheatmap plots
# Make heat map:
whole_targetSUM$Target <- NULL
#
df_w<-dcast(whole_targetSUM,Category~SAMPLE,fill=0, fun.aggregate = sum)
head(df_w[1:2,])
row.names(df_w)<-df_w$Category

#Categories of interest:
all <- c("Catalina_surface", "PortofLA_surface", "SPOT_surface", "ALOHA_surface","ALOHA_DCM", "ALOHA_150m", "SPOT_150m", "ALOHA_1000m", "SPOT_890m")
#
ALOHA <- c("ALOHA_surface","ALOHA_DCM", "ALOHA_150m", "ALOHA_1000m")
#
CA <- c("Catalina_surface", "PortofLA_surface", "SPOT_surface","SPOT_150m", "SPOT_890m")
#
surface_only <- c("Catalina_surface", "PortofLA_surface", "SPOT_surface", "ALOHA_surface")
surface_wDCM <- c("Catalina_surface", "PortofLA_surface", "SPOT_surface", "ALOHA_surface","ALOHA_DCM")
all_euphotic <- c("Catalina_surface", "PortofLA_surface", "SPOT_surface", "ALOHA_surface","ALOHA_DCM", "ALOHA_150m")
subsurface <- c("ALOHA_DCM", "ALOHA_150m", "SPOT_150m", "ALOHA_1000m", "SPOT_890m")
subeuphotic <- c("SPOT_150m", "ALOHA_1000m", "SPOT_890m")
Using SUM as value column: use value.var to override.

A data.frame: 2 × 10
CategoryALOHA_1000mALOHA_150mALOHA_DCMALOHA_surfaceCatalina_surfacePortofLA_surfaceSPOT_150mSPOT_890mSPOT_surface
<fct><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1Actin polymerization258.0576281.3999377.7012 339.9071 659.4209 506.6605499.4695299.4016 540.7881
2AMT 632.2946574.0023957.92151182.26291166.09541411.6010679.6627778.01621223.3163
In [248]:
# Function to plot pheatmaps
pheat_bySAMPLE<-function(df, subset_samples, title){
  df_subset <- subset(df, SAMPLE %in% subset_samples)
  df_w<-dcast(df_subset,Category~SAMPLE,fill=0, fun.aggregate = sum)
  row.names(df_w)<-df_w$Category
  df_w<-df_w[subset_samples]
  df_m<-as.matrix(df_w)
  out<-pheatmap(df_m, scale = "row", cluster_cols = FALSE,cluster_rows = TRUE, cellwidth=9, cellheight = 9, main= title)
  return(out)
}
In [249]:
options(repr.plot.width = 6, repr.plot.height = 10)
a<-pheat_bySAMPLE(whole_targetSUM, all, "All")
# b<-pheat_bySAMPLE(whole_targetSUM, ALOHA, "ALOHA")
# c<-pheat_bySAMPLE(whole_targetSUM, CA, "CA")
# d<-pheat_bySAMPLE(whole_targetSUM, surface_only, "Surface only")
# e<-pheat_bySAMPLE(whole_targetSUM, surface_wDCM, "Surface w/ DCM")
# f<-pheat_bySAMPLE(whole_targetSUM, all_euphotic, "All euphotic")
# g<-pheat_bySAMPLE(whole_targetSUM, subsurface, "All subsurface")
# h<-pheat_bySAMPLE(whole_targetSUM, subeuphotic, "Subeuphotic")
Using SUM as value column: use value.var to override.

In [ ]:
# ## Save all pheatmap outputs
# save_pheatmap_png <- function(x, filename, width=1200, height=1000, res = 150) {
#   png(filename, width = width, height = height, res = res)
#   grid::grid.newpage()
#   grid::grid.draw(x$gtable)
#   dev.off()
# }
# save_pheatmap_png(a, "wholecommunity.png")
# save_pheatmap_png(b, "NPSG.png")
# save_pheatmap_png(c, "CA.png")
# save_pheatmap_png(d, "surface.png")
# save_pheatmap_png(e, "surface_wDCM.png")
# save_pheatmap_png(f, "euphotic.png")
# save_pheatmap_png(g, "subsurface.png")
# save_pheatmap_png(h, "subeuphotic.png")

Barplots of transcriptional profiles by taxonomic group

In [255]:
load("Normedbytax_annotated.RData",verbose=T)
# data includes normalized BY individual groups
# tax_wcat$Category<- NULL
head(tax_wcat[1:2,])
Loading objects:
  tax_wcat
A data.frame: 2 × 20
LocationDepthTaxonomyKOmean_countsampletaxaGene_identificationABCDTargetTarget_2CategoryGeneIDFull_GeneNamePhagotrophy_annotationNotesX
<chr><chr><fct><fct><dbl><chr><chr><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><fct>
1ALOHA1000mAlveolate;Dinoflagellate;K000020.1031341ALOHA_1000mDinoflagellateAKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2]Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANANANANA
2ALOHA1000mAlveolate;Dinoflagellate;K000031.1344749ALOHA_1000mDinoflagellateE1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismSerine and threonine metabolismThreonine biosynthesis, aspartate => homoserine => threonineNANANANANANANANA
In [269]:
unique(tax_wcat$taxa); unique(tax_wcat$Category)
tax_wcat$Category <- as.character(tax_wcat$Category)
  1. 'Dinoflagellate'
  2. 'Ciliate'
  3. 'Haptophyta'
  4. 'Bacillariophyceae'
  5. 'Chlorophyta'
  6. 'Pelagophyceae'
  7. 'MAST'
  8. 'Rhizaria'
  1. <NA>
  2. Glyoxylate cycle
  3. TCA cycle
  4. Fatty acid biosynthesis
  5. additional breakdown
  6. Calvin cycle
  7. Gluconeogenesis
  8. Glycolysis
  9. PDH
  10. Fatty acid breakdown
  11. GS/GOGAT
  12. Inorganic N uptake and assimilation
  13. Triacylglycerol biosynthesis
  14. Chitinase
  15. phosphatidylinositol
  16. Endocytosis
  17. Phagosome maturation
  18. Motility and prey recognition
  19. P metabolism
  20. Lysosome binding and processing
  21. Organic N uptake and assimilation
  22. Urea cycle
  23. other
  24. V-type ATPase
  25. AMT
  26. por
  27. SNARE complex
  28. else
  29. Actin polymerization
  30. Nitrate reduction (assimilatory)
  31. NRT
  32. Photosynthesis
Levels:
  1. 'Actin polymerization'
  2. 'additional breakdown'
  3. 'AMT'
  4. 'Calvin cycle'
  5. 'Chitinase'
  6. 'else'
  7. 'Endocytosis'
  8. 'Fatty acid biosynthesis'
  9. 'Fatty acid breakdown'
  10. 'Gluconeogenesis'
  11. 'Glycolysis'
  12. 'Glyoxylate cycle'
  13. 'GS/GOGAT'
  14. 'Inorganic N uptake and assimilation'
  15. 'Lysosome binding and processing'
  16. 'Motility and prey recognition'
  17. 'Nitrate reduction (assimilatory)'
  18. 'NRT'
  19. 'Organic N uptake and assimilation'
  20. 'other'
  21. 'P metabolism'
  22. 'PDH'
  23. 'Phagosome maturation'
  24. 'phosphatidylinositol'
  25. 'Photosynthesis'
  26. 'por'
  27. 'SNARE complex'
  28. 'TCA cycle'
  29. 'Triacylglycerol biosynthesis'
  30. 'Urea cycle'
  31. 'V-type ATPase'
In [274]:
# Subset to relevant categories
rm <- c("butyryl", "other", "else", "additional breakdown", "por")
tax_plot_paths <- tax_wcat %>% 
    filter(!(Target == "Phagotrophy-other")) %>% 
    filter(!(Category %in% rm)) %>% 
    # Generate combined gluconeogenesus-glycolysis
    mutate(Category = case_when(
        grepl("Gluconeogenesis", Category) ~ "Gluconeogenesis-Glycolysis",
        grepl("Glycolysis", Category) ~ "Gluconeogenesis-Glycolysis",
        TRUE ~ Category)) %>% 
    select(Taxonomy, taxa, KO, mean_count, sample, Target, Category) %>% 
    filter(mean_count > 0) %>% 
    distinct() %>% 
    group_by(taxa, sample, Target, Category) %>%
    summarise(SUM_CPM = sum(mean_count)) %>%
    data.frame
head(tax_plot_paths[1:2,])
`summarise()` regrouping output by 'taxa', 'sample', 'Target' (override with `.groups` argument)

A data.frame: 2 × 5
taxasampleTargetCategorySUM_CPM
<chr><chr><fct><chr><dbl>
1BacillariophyceaeALOHA_1000mC metabolismCalvin cycle 2337.469
2BacillariophyceaeALOHA_1000mC metabolismGluconeogenesis-Glycolysis2587.927
In [340]:
# Factoring
sample_list<-c("Catalina_surface", "PortofLA_surface", "SPOT_surface", "ALOHA_surface","ALOHA_DCM", "ALOHA_150m", "SPOT_150m","SPOT_890m","ALOHA_1000m")
sample_label<-c("Catalina surface", "Port of LA surface", "SPOT surface", "ALOHA surface","ALOHA DCM", "ALOHA 150m", "SPOT 150m","SPOT 890m","ALOHA 1000m")
tax_plot_paths$SAMPLE_ORDER<-factor(tax_plot_paths$sample, levels=rev(sample_list), labels = rev(sample_label))

category_list <- c("Photosynthesis","Calvin cycle","Gluconeogenesis-Glycolysis","Glyoxylate cycle","TCA cycle","GS/GOGAT","AMT","NRT","Inorganic N uptake and assimilation","Organic N uptake and assimilation","Urea cycle","Nitrate reduction (assimilatory)","P metabolism","PDH","Actin polymerization","Lysosome binding and processing","Chitinase","Phagosome maturation","Endocytosis","Fatty acid biosynthesis","Fatty acid breakdown","Motility and prey recognition","SNARE complex","V-type ATPase")
category_color <- c("#ffffb2","#fed976","#feb24c","#fd8d3c","#f03b20","#bd0026","#f7fcb9","#addd8e","#41ab5d","#238443","#006837","#004529","#02818a","#a6bddb","#41b6c4","#1d91c0","#225ea8","#253494","#081d58","#f2f0f7","#cbc9e2","#9e9ac8","#756bb1","#54278f")
tax_plot_paths$CAT_ORDER <-factor(tax_plot_paths$Category, levels = rev(category_list))
names(category_color) <- (category_list)

# taxonomic order and label
tax_order<-c("Dinoflagellate","Ciliate","MAST","Rhizaria","Haptophyta","Bacillariophyceae","Pelagophyceae","Chlorophyta")
tax_label<-c("a. Dinoflagellates","b. Ciliates","c. MAST","d. Rhizaria","e. Haptophytes","f. Diatoms","g. Pelagophytes","h. Chlorophytes")
tax_plot_paths$TAX_ORDER <- factor(tax_plot_paths$taxa, levels= tax_order, labels = tax_label)

bar_cat <- ggplot(tax_plot_paths, aes(y=(SUM_CPM), x=SAMPLE_ORDER))+
  geom_bar(stat = "identity", position="fill", aes(fill=(CAT_ORDER)), color="#525252", width = 0.70, size = 0.2)+
  coord_flip() +
  scale_fill_manual(values = (category_color))+
  facet_wrap(.~TAX_ORDER, ncol = 4) +
  theme_minimal()+
  theme(legend.position = "bottom", 
        axis.text.y = element_text(color="black", hjust=1, face = "bold"),
        axis.title = element_text(color = "black", face = "bold"),
        strip.text = element_text(h=0, face="bold"), 
        legend.title = element_blank())+
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(x="", y="Relative abundance CPM")
In [342]:
options(repr.plot.width = 12, repr.plot.height = 6)
# svg("barplot-bytax-allfxn.svg", w = 12, h = 6)
bar_cat
# dev.off()

Statistical test to compare transcriptional profiles among taxa

In [343]:
# load library ggpubr
library(ggpubr)
Attaching package: ‘ggpubr’


The following object is masked from ‘package:cowplot’:

    get_legend


In [345]:
# load("Normedbytax_annotated.RData", verbose = T)
head(tax_wcat[1:2,]); dim(tax_wcat)
A data.frame: 2 × 20
LocationDepthTaxonomyKOmean_countsampletaxaGene_identificationABCDTargetTarget_2CategoryGeneIDFull_GeneNamePhagotrophy_annotationNotesX
<chr><chr><fct><fct><dbl><chr><chr><fct><fct><fct><fct><fct><fct><fct><chr><fct><fct><fct><fct><fct>
1ALOHA1000mAlveolate;Dinoflagellate;K000020.1031341ALOHA_1000mDinoflagellateAKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2]Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANANANANA
2ALOHA1000mAlveolate;Dinoflagellate;K000031.1344749ALOHA_1000mDinoflagellateE1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismSerine and threonine metabolismThreonine biosynthesis, aspartate => homoserine => threonineNANANANANANANANA
  1. 9397485
  2. 20
In [498]:
rm <- c("butyryl", "other", "else", "additional breakdown", "por")

# input_wilcox_df <- sample_n(tax_wcat, 500000) %>% #Test dataset
input_wilcox_df <- tax_wcat %>%  #FULL
    filter(!(Target == "Phagotrophy-other")) %>% 
    filter(!(Category %in% rm)) %>% 
    unite(tax_cat, Category, taxa, sep = "_", remove = FALSE) %>% 
    data.frame
In [499]:
wilcox_output <- compare_means(mean_count ~ sample, 
                     data = input_wilcox_df, 
                     method = "wilcox.test", 
                     group.by = "tax_cat",
                     p.adjust.method = "bonferroni")
head(wilcox_output)
table(wilcox_output$p.signif)
A tibble: 6 × 9
tax_cat.y.group1group2pp.adjp.formatp.signifmethod
<chr><chr><chr><chr><dbl><dbl><chr><chr><chr>
Glyoxylate cycle_Dinoflagellatemean_countALOHA_1000mALOHA_150m 6.348820e-88 4.5e-84< 2e-16****Wilcoxon
Glyoxylate cycle_Dinoflagellatemean_countALOHA_1000mALOHA_DCM 3.590977e-1642.6e-160< 2e-16****Wilcoxon
Glyoxylate cycle_Dinoflagellatemean_countALOHA_1000mALOHA_surface 3.860091e-2592.7e-255< 2e-16****Wilcoxon
Glyoxylate cycle_Dinoflagellatemean_countALOHA_1000mCatalina_surface3.716877e-2942.6e-290< 2e-16****Wilcoxon
Glyoxylate cycle_Dinoflagellatemean_countALOHA_1000mPortofLA_surface4.332997e-3013.1e-297< 2e-16****Wilcoxon
Glyoxylate cycle_Dinoflagellatemean_countALOHA_1000mSPOT_150m 2.725496e-2921.9e-288< 2e-16****Wilcoxon
   *   **  *** ****   ns 
 462  476  356 4048 1762 
In [503]:
wilcox_output_wide <- wilcox_output %>% 
    separate(tax_cat, c("Category", "Taxa"), sep = "_") %>% 
    select(-.y., -p, -p.format, -method) %>% 
    unite(COMPARE, Taxa, group1, group2, sep = "-") %>% 
    pivot_wider(id_cols = COMPARE, names_from = Category, 
                values_from = c(p.adj, p.signif),
                names_glue = "{Category}_{.value}") %>% 
    column_to_rownames("COMPARE") %>% 
    select(sort(colnames(.))) %>% 
    rownames_to_column("COMPARE") %>% 
    separate(COMPARE, c("Taxa", "SampleA", "SampleB"), sep = "-") %>% 
    data.frame
head(wilcox_output_wide)
A data.frame: 6 × 53
TaxaSampleASampleBActin.polymerization_p.adjActin.polymerization_p.signifAMT_p.adjAMT_p.signifCalvin.cycle_p.adjCalvin.cycle_p.signifChitinase_p.adjPhotosynthesis_p.adjPhotosynthesis_p.signifSNARE.complex_p.adjSNARE.complex_p.signifTCA.cycle_p.adjTCA.cycle_p.signifUrea.cycle_p.adjUrea.cycle_p.signifV.type.ATPase_p.adjV.type.ATPase_p.signif
<chr><chr><chr><dbl><chr><dbl><chr><dbl><chr><dbl><dbl><chr><dbl><chr><dbl><chr><dbl><chr><dbl><chr>
1DinoflagellateALOHA_1000mALOHA_150m 1.5e-13****1.0e+00*** 2.7e-130****1.0e+001.0e+00** 1.5e-09****5.2e-164****2.4e-04**** 3.1e-28****
2DinoflagellateALOHA_1000mALOHA_DCM 1.9e-35****9.8e-04****7.2e-274****4.3e-027.6e-08****1.2e-33**** 0.0e+00****2.2e-16**** 6.8e-69****
3DinoflagellateALOHA_1000mALOHA_surface 8.5e-66****1.8e-05**** 0.0e+00****3.8e-021.2e-37****8.2e-57**** 0.0e+00****9.2e-28****8.3e-130****
4DinoflagellateALOHA_1000mCatalina_surface7.5e-68****1.2e-05**** 0.0e+00****6.0e-055.2e-43****9.5e-73**** 0.0e+00****3.5e-33****3.3e-133****
5DinoflagellateALOHA_1000mPortofLA_surface9.0e-75****1.3e-06**** 0.0e+00****2.9e-054.5e-46****1.7e-69**** 0.0e+00****1.1e-38****3.1e-143****
6DinoflagellateALOHA_1000mSPOT_150m 1.9e-73****2.6e-05**** 0.0e+00****1.3e-051.2e-01****9.9e-63**** 0.0e+00****1.5e-34****2.7e-107****
In [504]:
write_delim(wilcox_output_wide, path = "Wilcox-paired-output.txt", delim = "\t")

List top 10 transcripts per taxonomic group and category

In [444]:
# load("Normedbytax_annotated.RData", verbose = T)
head(tax_wcat[1:2,]); dim(tax_wcat)
A data.frame: 2 × 20
LocationDepthTaxonomyKOmean_countsampletaxaGene_identificationABCDTargetTarget_2CategoryGeneIDFull_GeneNamePhagotrophy_annotationNotesX
<chr><chr><fct><fct><dbl><chr><chr><fct><fct><fct><fct><fct><fct><fct><chr><fct><fct><fct><fct><fct>
1ALOHA1000mAlveolate;Dinoflagellate;K000020.1031341ALOHA_1000mDinoflagellateAKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2]Pathway moduleCarbohydrate and lipid metabolism Other carbohydrate metabolism Glucuronate pathway (uronate pathway) NANANANANANANANA
2ALOHA1000mAlveolate;Dinoflagellate;K000031.1344749ALOHA_1000mDinoflagellateE1.1.1.3; homoserine dehydrogenase [EC:1.1.1.3] Pathway moduleNucleotide and amino acid metabolismSerine and threonine metabolismThreonine biosynthesis, aspartate => homoserine => threonineNANANANANANANANA
  1. 9397485
  2. 20
In [453]:
top10_bytax <- tax_wcat %>% 
    filter(!(Target == "Phagotrophy-other")) %>% 
    filter(!(Category %in% rm)) %>% 
    # Sum CPM across location and depth to find top transcripts per taxa
    group_by(Taxonomy, taxa, Gene_identification, KO, mean_count) %>% 
    distinct() %>% # Unduplicate kegg entries that fall into more than 1 category
    summarise(SUM = sum(mean_count)) %>% 
    group_by(taxa) %>% 
    arrange(taxa, desc(SUM)) %>% 
    top_n(10, SUM) %>% 
    data.frame
head(top10_bytax)
`summarise()` regrouping output by 'Taxonomy', 'taxa', 'Gene_identification', 'KO' (override with `.groups` argument)

A data.frame: 6 × 6
TaxonomytaxaGene_identificationKOmean_countSUM
<fct><chr><fct><fct><dbl><dbl>
1Stramenopile;Ochrophyta;Bacillariophyceae; BacillariophyceaePGK, pgk; phosphoglycerate kinase [EC:2.7.2.3] K009278190.115171992.42
2Stramenopile;Ochrophyta;Bacillariophyceae; BacillariophyceaeGAPDH, gapA; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]K001343728.042 78288.87
3Stramenopile;Ochrophyta;Bacillariophyceae; BacillariophyceaeFBA, fbaA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13] K016244100.281 73805.05
4Stramenopile;Ochrophyta;Bacillariophyceae;Incertae sedis;Chaetocerotaceae;Chaetoceros; BacillariophyceaeFBA, fbaA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13] K016242382.951 42893.12
5Stramenopile;Ochrophyta;Bacillariophyceae; BacillariophyceaePGK, pgk; phosphoglycerate kinase [EC:2.7.2.3] K009272020.526 42431.05
6Stramenopile;Ochrophyta;Bacillariophyceae;Incertae sedis;Chaetocerotaceae;Chaetoceros;Chaetoceros debilis;BacillariophyceaeFBA, fbaA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13] K016242334.073 42013.31
In [457]:
write_delim(top10_bytax, path = "top10-bytax.txt", delim = "\t")

END